home *** CD-ROM | disk | FTP | other *** search
- /*---------------------------------------------------------------
- Copyright 1995, Steve Israelson
-
- I own this code. You are free to use this code in any software
- you want. You may not sell this source code at all, you can
- sell your product though. If you want to include this code
- in any code collection (CD-Roms etc) this is OK as long as
- I get a complimentary copy.
- Steve.
-
- Regular expression matching. The RegExp::Parse() function will
- create a new regular expression object based on your input string.
- You can then ask this object if it matches a string and it will
- return true or false. Groups of these objects can perform miracles.
-
- These are the regular expression meta characters. I have no text
- defining what the standard characters are, so I made these up from
- memory. You can add more if you want, it's easy.
-
- ^ Beginning of line.
- $ End of line.
- [] Set.
- a-z A range of characters in a set.
- ~ The following characters are not in the set.
- * 0 or more of the previous pattern.
- + 1 or more of the previous pattern.
- . Any character.
- | Or. Used between any two patterns. NOT IMPLEMENTED!!!!!
- & Parameter. The previous pattern is a parameter.
- / The next character is a literal.
- All other characters are literals.
-
- Note: This parser is based on the ideas presented in the DrDobbs
- Sourcebook magazine July/August 1995 issue. The article is by
- Todd D. Esposito and Andrew K. Johnson. Before reading this
- article and entering and debugging their code, I had very little
- experience doing a scripting system. The concepts behind
- their system are more powerful than the ones I came up with.
- The concept behind this code is almost the same as the concepts
- they presented, but this implementation is more complete, and
- thus more usable for end users.
- ---------------------------------------------------------------*/
-
- #include "StevesRgExp.h"
- #include <String.h>
-
- /*---------------------------------------------------------------
- Creates a linked list of regular expressions representing
- the text. The root of the list will be returned.
- Pass in the text containing the expression.
- owner is used internally, so pass in nil.
- Also pass in the ID for this expression so you
- can figure out what expression matched the text.
- ---------------------------------------------------------------*/
- RegExp *RegExp::Parse(char *text, RegExp *owner, long exprID)
- {
- // parse the text and determine what type of expression
- // is in it. Make that type of reg exp object and
- // continue until the text is exhausted
- RegExp *theExpression = nil;
-
- switch (*text++)
- {
- case '^': // beginning of line
- theExpression = new RBeginLine(text, exprID);
- break;
- case '$': // end of line
- theExpression = new REndLine(text, exprID);
- break;
- case '[': // set
- theExpression = new RSetExpr(text, exprID);
- break;
- case '*': // zero or more
- if (owner)
- owner->type = kReg_ZeroOrMore;
- return Parse(text, owner, exprID);
- break;
- case '+': // one or more
- if (owner)
- owner->type = kReg_OneOrMore;
- return Parse(text, owner, exprID);
- break;
- case '.': // any char
- theExpression = new RAnyChar(text, exprID);
- break;
- case '|': // or
- theExpression = new ROrExpr(text, exprID);
- break;
- case '&': // previous was a parameter
- if (owner)
- owner->parameter = true;
- return Parse(text, owner, exprID);
- break;
- case '/': // literalize next char, handled in RLiteral
- default: // literal
- theExpression = new RLiteral(text - 1, exprID);
- break;
- case 0: // end of text, do nothing
- break;
- }
- return theExpression;
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- You MUST call next = Parse(text, this, newID);
- where text points to the characters that are left after
- you made your expression.
- ---------------------------------------------------------------*/
- RegExp::RegExp(long newID)
- {
- ID = newID;
- next = nil;
- type = kReg_Once; // default type
- parameter = false;
- }
-
- /*---------------------------------------------------------------
- Toast this object, but toast the next one first.
- ---------------------------------------------------------------*/
- RegExp::~RegExp()
- {
- if (next)
- delete next;
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- Pass in the text to match, the position of starting character
- to begin matching on, or 0 for the first character. Pass
- in a pointer to a short if you want to know the position
- of the next un-matched character, or nil if you don't.
- Pass in a list to hold the parameters, or nil if you
- don't want any parameters back.
- If we match, then the next expression in our list is tried.
- If the next one fails, then we try to match again, until
- we fail or the match succeeds. MatchOne() internally uses
- the nextChar variable to keep track of how many characters
- its matched, and MUST set it to the next character that
- was un-matched. The very first time you are called it will
- be -1.
- ---------------------------------------------------------------*/
- Boolean RegExp::Match(char *text, short start, short *last, LList *paramList)
- {
- short nextChar = -1;
-
- while (1)
- {
- // can we match our own criteria?
- if (!MatchOne(text, start, &nextChar))
- return false;
- // save the position of the last match
- if (!next && last)
- *last = nextChar;
- // can our sub expressions match?
- if (!next || next->Match(text, nextChar, last, paramList))
- {
- // if we have a parameter, then put it in the params here
- if (parameter && paramList)
- {
- char *param = new char[64]; // paramters default to this size, but you could dynamically do it
- strncpy(param, text + start, nextChar - start);
- param[nextChar - start] = 0; // terminate the string, does strncpy?
- paramList->InsertItemsAt(1, arrayIndex_First, ¶m); // add it to the front of the list
- }
- return true;
- }
- }
- return false;
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- Over-ride and return true if you match.
- The value of end will be preserved between calls, and
- will always start with -1. Start is the index of the first
- character to be considered.
- ---------------------------------------------------------------*/
- Boolean RegExp::MatchOne(char *text, short start, short *end)
- {
- *end = start;
- return false;
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- ---------------------------------------------------------------*/
- RBeginLine::RBeginLine(char *text, long newID) : RegExp(newID)
- {
- // nothing to do, make the next one in our list.
- next = Parse(text, this, newID);
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- ---------------------------------------------------------------*/
- Boolean RBeginLine::MatchOne(char *text, short start, short *end)
- {
- if (*end == -1 && !start) // we only match if we are at the start of the line, ie start = 0
- {
- *end = start;
- return true;
- }
- return false;
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- ---------------------------------------------------------------*/
- REndLine::REndLine(char *text, long newID) : RegExp(newID)
- {
- // nothing to do, make the next one in our list.
- next = Parse(text, this, newID);
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- ---------------------------------------------------------------*/
- Boolean REndLine::MatchOne(char *text, short start, short *end)
- {
- // we only match if there are no more characters left in this line
- if (*end == -1 && (text[start] == 0 || text[start] == '\r'))
- {
- *end = start;
- return true;
- }
- return false;
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- This makes an expression that can match a set. Simply
- uses an array of booleans to keep track of which
- characters are in the set. Could be better, but...
- The text should be "[...]" where ... can be any individual
- characters. you can also specify a range with the '-' character.
- Use '~' when you want to remove some chars from the set.
- [a-zA-Z~dD] matches all alphabetical chars except d and D
- ---------------------------------------------------------------*/
- RSetExpr::RSetExpr(char *text, long newID) : RegExp(newID)
- {
- // remove the set from the text
- for (int x = 0; x < 256; ++x)
- charSet[x] = 0;
- char state = 1;
- char prevChar = 0;
- while (*text && *text != ']')
- {
- if (*text == '/') // quote the next character, ie the ']', or the '/'
- ++text;
- if (*text == '-' && prevChar && *(text + 1)) // set a whole range
- {
- ++text;
- for (int x = prevChar; x <= *text; ++x)
- charSet[x] = state;
- }
- else if (*text == '~')
- state = 0;
- else
- charSet[*text] = state;
- prevChar = *text;
- ++text; // next character
- }
- if (*text) // skip the ']'
- ++text;
-
- // make the next one in our list.
- next = Parse(text, this, newID);
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- ---------------------------------------------------------------*/
- Boolean RSetExpr::MatchOne(char *text, short start, short *end)
- {
- if (type == kReg_Once && *end != -1) // end is -1 the first time, so if it is not 0 then...
- return false;
- if (type == kReg_ZeroOrMore && *end == -1)// we first try matching 0
- {
- *end = start;
- return true;
- }
- if (*end == -1) // the first time through, try only the first char
- *end = start;
- for (int x = start; x <= *end; ++x)
- if (!charSet[text[x]])
- return false;
- *end = *end + 1; // we matched this char, so move end.
- return true;
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- ---------------------------------------------------------------*/
- RAnyChar::RAnyChar(char *text, long newID) : RegExp(newID)
- {
- // make the next one in our list.
- next = Parse(text, this, newID);
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- ---------------------------------------------------------------*/
- Boolean RAnyChar::MatchOne(char *text, short start, short *end)
- {
- if (type == kReg_Once && *end != -1) // end is -1 the first time, so if it is not 0 then...
- return false;
- if (type == kReg_ZeroOrMore && *end == -1)// we first try matching 0
- {
- *end = start;
- return true;
- }
- if (*end == -1) // the first time through, try only the first char
- *end = start;
- // since we match anything, we do not need to make any checks here
- // EXCEPT to see if we are at the end of the string
- if (!text[*end])
- return false; // no more chars to match
- *end = *end + 1; // we matched this char, so move end.
- return true;
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- ---------------------------------------------------------------*/
- ROrExpr::ROrExpr(char *text, long newID) : RegExp(newID)
- {
- // NOT IMPLEMENTED yet
- // make the next one in our list.
- next = Parse(text, this, newID);
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- ---------------------------------------------------------------*/
- Boolean ROrExpr::MatchOne(char *text, short start, short *end)
- {
- return false; // We do not implement OR yet (how would we?)
- }
-
- /*---------------------------------------------------------------
- Construct a regular expression based on some text.
- Collect characters until a meta character is encountered.
- This is our literal.
- ---------------------------------------------------------------*/
- RLiteral::RLiteral(char *text, long newID) : RegExp(newID)
- {
- Boolean done = false;
- short index = 0;
-
- while (!done)
- {
- switch (*text)
- {
- case 0:
- case '^': // beginning of line
- case '$': // end of line
- case '[': // set
- case '*': // zero or more
- case '+': // one or more
- case '.': // any char
- case '|': // or
- done = true;
- break;
- case '/': // literalize next char
- ++text; // skip the slash and drop into the literal code
- default: // literal
- buffer[index++] = *text++;
- break;
- }
- }
- buffer[index] = 0; // terminate string
- next = Parse(text, this, newID);
- }
-
- /*---------------------------------------------------------------
- Match the regular expression with some text.
- Match the literal possible 0 or more times.
- ---------------------------------------------------------------*/
- Boolean RLiteral::MatchOne(char *text, short start, short *end)
- {
- short x;
- if (type == kReg_Once && *end != -1) // end is -1 the first time, so if it is not 0 then...
- return false;
- if (type == kReg_ZeroOrMore && *end == -1) // we first try matching 0
- {
- *end = start;
- return true;
- }
- if (*end == -1) // the first time through, try only the first set
- *end = start;
- for (x = start; x <= *end;)
- {
- short i = 0;
- while (buffer[i]) // match the entire buffer, return false if we hit the end of the string
- if (!text[x] || (buffer[i++] != text[x++]))
- return false;
- }
- *end = x; // the end has moved
- return true;
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-